In [1]:
%matplotlib inline
In [2]:
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from mpl_toolkits.mplot3d import Axes3D
In [3]:
from mountaincar import MountainCar, MountainCarViewer
In [4]:
# stop if any overflow encountered
np.seterr(over='raise');
In [5]:
car = MountainCar()

Plot functions

In [6]:
def vec_plot(p):
    '''Give an array `p` of probabilities, plots the Q-values direction vector field.'''
    p_max = np.argmax(p, axis=2)
    
    # define arrow direction
    U = p_max - 1
    V = np.zeros((ngrid_pos, ngrid_speed))

    plt.quiver(U, V, alpha=1, scale=1.8, units='xy')
    plt.xlim(-1, 20)
    plt.xticks(())
    plt.ylim(-1, 20)
    plt.yticks(())
    plt.xlabel('position $x$')
    plt.ylabel('speed $\dot x$')
    plt.title('Q-values direction vector field (arrows show the direction of applied force)')

    plt.show()
In [7]:
def plot3D(q):
    '''Given q-values `q`, plots in 3D all possibles states.'''
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    
    # generate all parameters combination for states
    x, y = np.meshgrid(x_pos, y_speed)
    ax.plot_wireframe(x, y, q, color='grey')
    ax.set_xlabel('position')
    ax.set_ylabel('speed')
    ax.set_zlabel('max q')
    
    plt.show()

Helper coordinates

In [8]:
# grid discretisation
ngrid_pos = 20
ngrid_speed = 20
In [9]:
# grid corners
int_pos = -150, 30
int_speed = -15, 15
In [10]:
# prepare all parameters combination for states
x_pos, center_dist_pos = np.linspace(int_pos[0], int_pos[1], ngrid_pos, retstep=True)
y_speed, center_dist_speed = np.linspace(int_speed[0], int_speed[1], ngrid_speed, retstep=True)
y_speed_t = y_speed.reshape(-1, 1)

Helper functions

In [11]:
def activity(s):
    '''Given a state `s`, returns local continuous activity response.'''
    return np.exp(- ((x_pos - s[0]) / center_dist_pos) ** 2 - ((y_speed_t - s[1]) / center_dist_speed) ** 2).T
In [12]:
def Q(s, a, w):
    '''Given a state `s`, an action `a` and weights `w`, returns corresponding q-values.'''
    return np.sum(w[:, :, a] * activity(s))
In [13]:
def softmax(x, tau):
    '''Given an array `x` and a temperature parameter `tau`, returns robust softmax applied on the array.'''
    delta = (np.max(x) - np.min(x))
    
    # all zero mean 1/len(x) chance for each action
    if np.isclose(delta, 0):
        return np.ones_like(x) / len(x)
    
    # rescale to avoid overflow issues
    xp = (np.array(x) - np.min(x)) / delta
    
    e_x = np.exp(xp / tau)
    return e_x / e_x.sum()
In [14]:
qs = np.array([0.3, 0.2, 0.1])
x = np.logspace(-1, 3, 40)
y = np.array(list(zip(*[softmax(qs, xi) for xi in x])))

plt.xscale("log")
plt.xlabel("tau")
plt.ylabel("probability")
plt.plot(x,y[0])
plt.plot(x,y[1])
plt.plot(x,y[2])
plt.show()
In [15]:
tau_max = 5
tau_min = 1e-1
tau_steps = 100
x = range(tau_steps)
y = [tau_max * np.exp((1 / tau_steps) * np.log(tau_min / tau_max)) ** i for i in x]
plt.plot(x,y)
Out[15]:
[<matplotlib.lines.Line2D at 0x113d9bdd8>]

SARSA algo

In [16]:
def sarsa(n_epi=100,
          tau_max=1, # exploration/expoitation parameter
          tau_min=1e-1,
          expire=100,
          gamma=0.95, 
          lmbda = 0.05, 
          eta = 0.01,
          fill = 0,
          show = False,
          limit = 2000,
          dt=0.01, 
          steps=100):
    '''Given hyperparameters, run the sarsa algorithm on `n_epi` episode and returns weight, probabilities and latency history.'''
    
    # store latency and probabilities
    probs = []
    latencies = []

    # decreasing exponential coeficient for tau
    tau_coef = np.exp((1 / expire) * np.log(tau_min / tau_max))
    tau = tau_max
    
    # initial weights
    w = np.ones((ngrid_pos, ngrid_speed, 3)) * fill

    i = 0
    while i < n_epi:
        print("------------------------")
        print("episode :", i)
        print("with tau :", tau)

        # null eligibility traces
        e = np.zeros((ngrid_pos, ngrid_speed, 3))

        # initial state
        car.reset()
        s0 = car.x, car.x_d

        # initial random action
        a0 = np.random.randint(3)

        j = 0
        while j < limit:
            j += 1

            # take action, simulate and retrieve new state
            car.apply_force(a0 - 1)
            car.simulate_timesteps(steps, dt)
            s1 = car.x, car.x_d

            # compute probabilities for each action and choose among them
            p = softmax([Q(s1, a, w) for a in range(3)], tau)
            a1 = np.random.choice(range(3), p=p)

            # decrease eligibility traces and increase selected action
            e *= gamma * lmbda
            e[:, :, a0] += activity(s0)[:, :]

            # update weights accordingly
            # the factor j / 1000 has been added after discussion with TAs in order to increase convergence speed
            delta = car.R + gamma * Q(s1, a1, w) - Q(s0, a0, w) - j / 1000
            w += eta * delta * e

            # propagate next action and state
            a0 = a1
            s0 = s1

            if car.R > 0.0:
                i += 1
                
                tau *= tau_coef
        
                prob = np.array([[softmax([Q((x, y), a, w) for a in range(3)], tau) for x in x_pos] for y in y_speed])
                max_action = -np.max([[[Q((x, y), a, w) for a in range(3)] for x in x_pos] for y in y_speed], axis=2)

                if (show):
                    vec_plot(prob)
                    plot3D(max_action)
                    plt.show()

                probs.append(prob)
                latencies.append(car.t)
                
                print('reward obtained at t =', car.t)
                break
        
    return w, probs, latencies
In [17]:
w, probs, latencies = sarsa(show=True)
------------------------
episode : 0
with tau : 1
reward obtained at t = 856.0
------------------------
episode : 1
with tau : 0.977237220956
------------------------
episode : 1
with tau : 0.977237220956
------------------------
episode : 1
with tau : 0.977237220956
------------------------
episode : 1
with tau : 0.977237220956
reward obtained at t = 1726.0
------------------------
episode : 2
with tau : 0.954992586021
reward obtained at t = 1307.0
------------------------
episode : 3
with tau : 0.933254300797
reward obtained at t = 453.0
------------------------
episode : 4
with tau : 0.912010839356
reward obtained at t = 729.0
------------------------
episode : 5
with tau : 0.891250938134
reward obtained at t = 388.0
------------------------
episode : 6
with tau : 0.870963589956
reward obtained at t = 157.0
------------------------
episode : 7
with tau : 0.851138038202
reward obtained at t = 362.0
------------------------
episode : 8
with tau : 0.831763771103
reward obtained at t = 321.0
------------------------
episode : 9
with tau : 0.812830516164
reward obtained at t = 226.0
------------------------
episode : 10
with tau : 0.794328234724
reward obtained at t = 117.0
------------------------
episode : 11
with tau : 0.776247116629
reward obtained at t = 151.0
------------------------
episode : 12
with tau : 0.758577575029
reward obtained at t = 100.0
------------------------
episode : 13
with tau : 0.741310241301
reward obtained at t = 217.0
------------------------
episode : 14
with tau : 0.724435960075
reward obtained at t = 231.0
------------------------
episode : 15
with tau : 0.707945784384
reward obtained at t = 131.0
------------------------
episode : 16
with tau : 0.691830970919
reward obtained at t = 165.0
------------------------
episode : 17
with tau : 0.676082975392
reward obtained at t = 120.0
------------------------
episode : 18
with tau : 0.660693448008
reward obtained at t = 60.0
------------------------
episode : 19
with tau : 0.645654229035
reward obtained at t = 133.0
------------------------
episode : 20
with tau : 0.63095734448
reward obtained at t = 98.0
------------------------
episode : 21
with tau : 0.616595001861
reward obtained at t = 158.0
------------------------
episode : 22
with tau : 0.602559586074
reward obtained at t = 118.0
------------------------
episode : 23
with tau : 0.588843655356
reward obtained at t = 93.0
------------------------
episode : 24
with tau : 0.575439937337
reward obtained at t = 207.0
------------------------
episode : 25
with tau : 0.56234132519
reward obtained at t = 148.0
------------------------
episode : 26
with tau : 0.549540873858
reward obtained at t = 130.0
------------------------
episode : 27
with tau : 0.53703179637
reward obtained at t = 93.0
------------------------
episode : 28
with tau : 0.52480746025
reward obtained at t = 126.0
------------------------
episode : 29
with tau : 0.512861383991
reward obtained at t = 105.0
------------------------
episode : 30
with tau : 0.501187233627
reward obtained at t = 121.0
------------------------
episode : 31
with tau : 0.489778819368
reward obtained at t = 133.0
------------------------
episode : 32
with tau : 0.478630092323
reward obtained at t = 130.0
------------------------
episode : 33
with tau : 0.467735141287
reward obtained at t = 73.0
------------------------
episode : 34
with tau : 0.457088189615
reward obtained at t = 133.0
------------------------
episode : 35
with tau : 0.446683592151
reward obtained at t = 94.0
------------------------
episode : 36
with tau : 0.43651583224
reward obtained at t = 104.0
------------------------
episode : 37
with tau : 0.426579518802
reward obtained at t = 66.0
------------------------
episode : 38
with tau : 0.41686938347
reward obtained at t = 45.0
------------------------
episode : 39
with tau : 0.407380277804
reward obtained at t = 44.0
------------------------
episode : 40
with tau : 0.398107170553
reward obtained at t = 84.0
------------------------
episode : 41
with tau : 0.389045144994
reward obtained at t = 74.0
------------------------
episode : 42
with tau : 0.380189396321
reward obtained at t = 72.0
------------------------
episode : 43
with tau : 0.371535229097
reward obtained at t = 43.0
------------------------
episode : 44
with tau : 0.36307805477
reward obtained at t = 74.0
------------------------
episode : 45
with tau : 0.354813389234
reward obtained at t = 64.0
------------------------
episode : 46
with tau : 0.346736850453
reward obtained at t = 44.0
------------------------
episode : 47
with tau : 0.338844156139
reward obtained at t = 70.0
------------------------
episode : 48
with tau : 0.331131121483
reward obtained at t = 69.0
------------------------
episode : 49
with tau : 0.32359365693
reward obtained at t = 102.0
------------------------
episode : 50
with tau : 0.316227766017
reward obtained at t = 84.0
------------------------
episode : 51
with tau : 0.309029543251
reward obtained at t = 87.0
------------------------
episode : 52
with tau : 0.30199517204
reward obtained at t = 40.0
------------------------
episode : 53
with tau : 0.295120922667
reward obtained at t = 26.0
------------------------
episode : 54
with tau : 0.288403150313
reward obtained at t = 114.0
------------------------
episode : 55
with tau : 0.281838293126
reward obtained at t = 65.0
------------------------
episode : 56
with tau : 0.275422870334
reward obtained at t = 100.0
------------------------
episode : 57
with tau : 0.269153480393
reward obtained at t = 81.0
------------------------
episode : 58
with tau : 0.26302679919
reward obtained at t = 93.0
------------------------
episode : 59
with tau : 0.257039578277
reward obtained at t = 78.0
------------------------
episode : 60
with tau : 0.251188643151
reward obtained at t = 101.0
------------------------
episode : 61
with tau : 0.245470891569
reward obtained at t = 53.0
------------------------
episode : 62
with tau : 0.239883291902
reward obtained at t = 77.0
------------------------
episode : 63
with tau : 0.234422881532
reward obtained at t = 71.0
------------------------
episode : 64
with tau : 0.229086765277
reward obtained at t = 85.0
------------------------
episode : 65
with tau : 0.223872113857
reward obtained at t = 30.0
------------------------
episode : 66
with tau : 0.218776162395
reward obtained at t = 90.0
------------------------
episode : 67
with tau : 0.21379620895
reward obtained at t = 35.0
------------------------
episode : 68
with tau : 0.208929613085
reward obtained at t = 55.0
------------------------
episode : 69
with tau : 0.204173794467
reward obtained at t = 85.0
------------------------
episode : 70
with tau : 0.199526231497
reward obtained at t = 86.0
------------------------
episode : 71
with tau : 0.194984459976
reward obtained at t = 86.0
------------------------
episode : 72
with tau : 0.190546071796
reward obtained at t = 45.0
------------------------
episode : 73
with tau : 0.186208713666
reward obtained at t = 32.0
------------------------
episode : 74
with tau : 0.181970085861
reward obtained at t = 43.0
------------------------
episode : 75
with tau : 0.177827941004
reward obtained at t = 53.0
------------------------
episode : 76
with tau : 0.173780082875
reward obtained at t = 63.0
------------------------
episode : 77
with tau : 0.169824365246
reward obtained at t = 45.0
------------------------
episode : 78
with tau : 0.165958690744
reward obtained at t = 75.0
------------------------
episode : 79
with tau : 0.162181009736
reward obtained at t = 61.0
------------------------
episode : 80
with tau : 0.158489319246
reward obtained at t = 42.0
------------------------
episode : 81
with tau : 0.154881661891
reward obtained at t = 78.0
------------------------
episode : 82
with tau : 0.151356124844
reward obtained at t = 58.0
------------------------
episode : 83
with tau : 0.147910838817
reward obtained at t = 75.0
------------------------
episode : 84
with tau : 0.144543977075
reward obtained at t = 63.0
------------------------
episode : 85
with tau : 0.141253754462
reward obtained at t = 71.0
------------------------
episode : 86
with tau : 0.13803842646
reward obtained at t = 69.0
------------------------
episode : 87
with tau : 0.134896288259
reward obtained at t = 91.0
------------------------
episode : 88
with tau : 0.131825673856
reward obtained at t = 59.0
------------------------
episode : 89
with tau : 0.128824955169
reward obtained at t = 56.0
------------------------
episode : 90
with tau : 0.125892541179
reward obtained at t = 60.0
------------------------
episode : 91
with tau : 0.123026877081
reward obtained at t = 50.0
------------------------
episode : 92
with tau : 0.120226443462
reward obtained at t = 29.0
------------------------
episode : 93
with tau : 0.117489755494
reward obtained at t = 71.0
------------------------
episode : 94
with tau : 0.11481536215
reward obtained at t = 76.0
------------------------
episode : 95
with tau : 0.11220184543
reward obtained at t = 31.0
------------------------
episode : 96
with tau : 0.109647819614
reward obtained at t = 84.0
------------------------
episode : 97
with tau : 0.107151930524
reward obtained at t = 62.0
------------------------
episode : 98
with tau : 0.104712854805
reward obtained at t = 65.0
------------------------
episode : 99
with tau : 0.102329299228
reward obtained at t = 64.0

The following code is used for finding best params and therefore is not required to be reviewed.

In [ ]:
origin_path = "../grid_search/"
dfs = []

for subd, dirs, files in os.walk(origin_path):
    if len(dirs) > 0:
        continue
        
    folder = subd + "/"
    id_ = int(subd[-1])
    
    print("id :",id_)  
    
    links = []
    
    for file in files:
        if file == "log.txt":
            continue
        
        file_arr = file.split("_")
        
        if len(file_arr) > 8:
            tau = "reduce"
        else:
            tau = file_arr[3]
        
        lmbda =  file_arr[5]
        fill = file_arr[7]
        
        links.append({
            "tau": tau,
            "lmbda": float(lmbda),
            "fill": int(fill),
            "link": folder + file
        })
    
    dfs.append(links)
In [ ]:
def search_run(obj):
    return  (obj["fill"] == search["fill"] and 
             obj["lmbda"] == search["lmbda"] and 
             obj["tau"] == search["tau"])
In [ ]:
# imported from https://tonysyu.github.io/plotting-error-bars.html#.WIi79RiZOsw
def errorfill(x, y, yerr, color=None, alpha_fill=0.3, ax=None):
    ax = ax if ax is not None else plt.gca()
    ax.set_xlabel("episodes")
    ax.set_ylabel("iterations to goal")
    if color is None:
        color = ax._get_lines.color_cycle.next()
    if np.isscalar(yerr) or len(yerr) == len(y):
        ymin = y - yerr
        ymax = y + yerr
    elif len(yerr) == 2:
        ymin, ymax = yerr
    ax.plot(x, y, color=color)
    ax.fill_between(x, ymax, ymin, color=color, alpha=alpha_fill)
In [ ]:
def plot_latencies(search_obj, a=None,smooth=10):
    if a is None:
        search = search_obj
        latencies = np.array(
            [
                pd.read_pickle(
                    list(filter(search_run, sims))[0]["link"]
                ).latencies for sims in dfs])
    else:
        latencies = np.array(a)
    lat_m = latencies.mean(axis=0)
    var = np.sqrt(latencies.var(axis=0))
    
    print("mean :", lat_m.mean())
    print("mean last 60 :", lat_m[60:].mean())
    print("min :", latencies.min())
    
    plt.figure(figsize=(10, 8), dpi=80)
    plt.plot(100*[lat_m[60:].mean()], "red")
    errorfill(range(100), lat_m, var, color="gray")
    plt.show()
In [ ]:
a_ = []
for i in range(8):
    w, probs, latencies = sarsa(fill=0, lmbda=0.05, tau_max=1, tau_min=1)
    a_.append(latencies)
In [ ]:
a2 = []
for i in range(8):
    w, probs, latencies = sarsa(fill=0, lmbda=0.05, tau_max=1, tau_min=0.1)
    a2.append(latencies)
In [ ]:
search = {"fill": 0, "lmbda": 0.05, "tau": '0.1'}
   
res = filter(search_run, dfs[0])

print(list(res))

plot_latencies(search, a=a2)
In [ ]:
w, probs, latencies = sarsa(fill=0, lmbda=0.95, tau_max=10, tau_min=0.1, show=True)
In [ ]:
w, probs, latencies = sarsa(fill=1, lmbda=0.95, tau_max=10, tau_min=0.1, show=False)
In [ ]:
plt.plot(latencies)
plt.show()
In [ ]:
lats = []

for i in range(20):
    w, probs, latencies = sarsa(fill=0, lmbda=0.05, tau_max=10, tau_min=0.1, show=False)
    lats.append(latencies)
In [ ]:
lats = np.array(lats)

plt.figure(figsize=(10, 8), dpi=80)

for lat in lats:
    plt.plot(lat, "b", alpha=0.2)

lat_m = lats.mean(axis=0)
var = np.sqrt(lats.var(axis=0))

plt.plot(100*[lat_m[60:].mean()], "red", alpha=1)
errorfill(range(100), lat_m, var, color="black")
    
plt.show()
In [ ]:
i = 0
latencies = np.array([pd.read_pickle(sims[i]["link"]).latencies for sims in dfs])
In [ ]:
lat_m = latencies.mean(axis=0)
var = np.sqrt(latencies.var(axis=0))
In [ ]:
# plot latency evolution and moving avergage of it
smooth = 10
plt.plot(lat_m, "b")
plt.plot(lat_m - var, "r+")
plt.plot(lat_m + var, "r+")
plt.plot(np.convolve(np.ones(smooth) / smooth, lat_m, mode='same'))